{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "ARIMA.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "86AaXZAIGKTp",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VSEwsAhmGPoI",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import numpy as np \n",
        "import pandas as pd\n",
        "import matplotlib.dates as mdates\n",
        "import seaborn as sns\n",
        "from sklearn import preprocessing\n",
        "import time\n",
        "from datetime import datetime\n",
        "from tqdm.auto import trange, tqdm\n",
        "import os\n",
        "import math\n",
        "import datetime as dt\n",
        "from numpy import newaxis\n",
        "from keras.layers import Dense, Activation, Dropout, LSTM, GRU, SimpleRNN, BatchNormalization\n",
        "from keras.models import Sequential, load_model, Model\n",
        "from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard\n",
        "import matplotlib.pyplot as plt\n",
        "from keras.optimizers import SGD,Adam\n",
        "from sklearn.preprocessing import MinMaxScaler\n",
        "from keras.preprocessing.sequence import TimeseriesGenerator\n",
        "from keras import backend as K\n",
        "from keras.losses import MeanSquaredLogarithmicError\n",
        "from keras import Input\n",
        "from statsmodels.tsa.stattools import adfuller, acf, pacf\n",
        "from statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n",
        "!pip install pmdarima\n",
        "from pmdarima.arima import ndiffs"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "L-dkl3hWGZQX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# data path\n",
        "rootPath = '/content/drive/My Drive/Colab Notebooks/NCML/'\n",
        "trainFileName = 'input/train.csv'\n",
        "testFileName = 'input/test.csv'\n",
        "trainWithFlightFileName = 'train_old_population.csv'"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DlozIP2LGZzO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def datelist(beginDate, endDate):\n",
        "    # beginDate, endDate are something like ‘20160601’\n",
        "    date_l=[datetime.strftime(x,'%Y-%m-%d') for x in list(pd.date_range(start=beginDate, end=endDate))]\n",
        "    return date_l\n",
        "\n",
        "\n",
        "def date_remove_year(dates):\n",
        "    return [datetime.strftime(datetime.strptime(x, '%Y-%m-%d'),'%m-%d') for x in dates]\n",
        "\n",
        "\n",
        "def dateStrs_to_dateframe(dates):\n",
        "    return [datetime.strptime(x, '%Y-%m-%d') for x in dates]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EANwczXEGbqq",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def loadDataByCountry(Country, data_all=None, dateRange= ('2020-01-22','2020-04-13'), is_startFromFirstCaseDay=False, daily=False):\n",
        "    confirmed_total_date = data_all[data_all['Country_Region']==Country].groupby(['Date']).agg({'ConfirmedCases':['sum']})\n",
        "    flight_total_date = data_all[data_all['Country_Region']==Country].groupby(['Date']).agg({'DepartureFlight':['mean']})\n",
        "    data = data_all[data_all['Country_Region']==Country].groupby(['Date']).agg({'ConfirmedCases':['sum'],\n",
        "                                                                                'Fatalities':['sum'],\n",
        "                                                                                # 'DepartureFlight':['mean'],\n",
        "                                                                                # 'hospibed':['mean'],\n",
        "                                                                                # 'lung':['mean'],\n",
        "                                                                                # 'total_pop':['mean'],\n",
        "                                                                                # 'density':['mean'],\n",
        "                                                                                # 'age_100+':['mean']\n",
        "                                                                                })\n",
        "    day = flight_total_date.shape[0]\n",
        "#     for i in tqdm(range(day), desc='Complete zero-value in flight data'):\n",
        "#         if flight_total_date.iloc[i].values[0]==0:\n",
        "#             flight_total_date.iloc[i] = int((flight_total_date.iloc[i-1]+flight_total_date.iloc[i+1])/2)\n",
        "    \n",
        "    is_FirstCase = False\n",
        "    FirstCaseDate = dateRange[0]\n",
        "    for i in tqdm(range(day), desc='Complete zero-value in data'):\n",
        "        Date = data.iloc[i].name\n",
        "        if not is_FirstCase:\n",
        "            if data.iloc[i].values[1]!=0:\n",
        "                print('For ', Country, 'The first case during the period just has been found', Date)\n",
        "                FirstCaseDate = Date\n",
        "                is_FirstCase = True\n",
        "            \n",
        "        # if data.iloc[i].values[2]==0:\n",
        "        #     data.loc[Date,'DepartureFlight'] = int((data.iloc[i-1]['DepartureFlight']+data.iloc[i+1]['DepartureFlight'])/2)\n",
        "    \n",
        "    if daily:\n",
        "        data_ConfirmedCases = data.ConfirmedCases.diff()\n",
        "        data_Fatalities = data.Fatalities.diff()\n",
        "        \n",
        "        if not is_startFromFirstCaseDay:\n",
        "            data.ConfirmedCases = data_ConfirmedCases\n",
        "            data.Fatalities = data_Fatalities\n",
        "            return data.iloc[1:].loc[dateRange[0]: dateRange[1]]\n",
        "        else:\n",
        "            data.ConfirmedCases = data_ConfirmedCases\n",
        "            data.Fatalities = data_Fatalities\n",
        "            return data.iloc[1:].loc[FirstCaseDate: dateRange[1]]\n",
        "    \n",
        "    if not is_startFromFirstCaseDay:   \n",
        "        return data.loc[dateRange[0]: dateRange[1]]\n",
        "    else:\n",
        "        return data.loc[FirstCaseDate: dateRange[1]]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "I5lOZ5FjGeuV",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "data_test = pd.read_csv(rootPath+testFileName)\n",
        "data_all = pd.read_csv(rootPath+trainWithFlightFileName)\n",
        "data_all.Province_State.fillna(\"None\", inplace=True)\n",
        "data_all = data_all.drop(['Province_State'], axis=1)\n",
        "dayList =datelist ('20200122','20200413')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SUn5pUD7Gh7c",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# ADF test\n",
        "df_adftest = adfuller(train[:,0])\n",
        "# df_adftest = adfuller(train[:,1])\n",
        "df_adftest = pd.DataFrame({'Statistical test': [df_adftest[0]], 'P-Value': [df_adftest[1]], 'usedlag': [df_adftest[2]]})\n",
        "df_adftest"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "u_zXRJR4GlNs",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# split data\n",
        "split = 0.6\n",
        "# 'China', 'US', 'Japan', 'United Kingdom', 'Korea. South', 'Italy'\n",
        "country = 'China'\n",
        "data = loadDataByCountry(Country = country, data_all=data_all, is_startFromFirstCaseDay=True, daily=False)\n",
        "data = data.iloc[6:,:]\n",
        "num_test = int((1-split)/2*(data.shape[0]))\n",
        "num_train = data.shape[0]-num_test\n",
        "indices = np.array(range(data.shape[0]))\n",
        "test = data.values[indices[-num_test:]]\n",
        "train = data.values[indices[:num_train]]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JsxwwdK_G7g7",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# plot ACF and get d, p\n",
        "plt.rcParams.update({'figure.figsize':(12,5), 'figure.dpi':120})\n",
        "# Original Series\n",
        "fig, axes = plt.subplots(3, 2, sharex=True)\n",
        "axes[0, 0].plot(train[:,0]); axes[0, 0].set_title('Original Series')\n",
        "plot_acf(train[:,0], ax=axes[0, 1])\n",
        "# 1st Differencing\n",
        "axes[1, 0].plot(np.diff(train[:,0])); axes[1, 0].set_title('1st Order Differencing')\n",
        "plot_acf(np.diff(train[:,0]), ax=axes[1, 1])\n",
        "# 2nd Differencing\n",
        "axes[2, 0].plot(np.diff(np.diff(train[:,0]))); axes[2, 0].set_title('2nd Order Differencing')\n",
        "plot_acf(np.diff(np.diff(train[:,0])), ax=axes[2, 1])\n",
        "plt.show()\n",
        "\n",
        "# plt.rcParams.update({'figure.figsize':(12,5), 'figure.dpi':120})\n",
        "# # Original Series\n",
        "# fig, axes = plt.subplots(3, 2, sharex=True)\n",
        "# axes[0, 0].plot(train[:,1]); axes[0, 0].set_title('Original Series')\n",
        "# plot_acf(train[:,1], ax=axes[0, 1])\n",
        "# # 1st Differencing\n",
        "# axes[1, 0].plot(np.diff(train[:,1])); axes[1, 0].set_title('1st Order Differencing')\n",
        "# plot_acf(np.diff(train[:,1]), ax=axes[1, 1])\n",
        "# # 2nd Differencing\n",
        "# axes[2, 0].plot(np.diff(np.diff(train[:,1]))); axes[2, 0].set_title('2nd Order Differencing')\n",
        "# plot_acf(np.diff(np.diff(train[:,1])), ax=axes[2, 1])\n",
        "# plt.show()\n",
        "\n",
        "# Adf Test\n",
        "print(ndiffs(train[:,0], test='adf'))\n",
        "# KPSS test\n",
        "print(ndiffs(train[:,0], test='kpss'))\n",
        "# PP test:\n",
        "print(ndiffs(train[:,0], test='pp'))\n",
        "\n",
        "# # # Adf Test\n",
        "# print(ndiffs(train[:,1], test='adf'))\n",
        "# # KPSS test\n",
        "# print(ndiffs(train[:,1], test='kpss'))\n",
        "# # PP test:\n",
        "# print(ndiffs(train[:,1], test='pp'))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7aaEiwGFHJUZ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# plot PACF and get q\n",
        "plt.rcParams.update({'figure.figsize':(12,2), 'figure.dpi':120})\n",
        "fig, axes = plt.subplots(1, 2, sharex=True)\n",
        "axes[0].plot(np.diff(np.diff(train[:,0]))); axes[0].set_title('2nd Differencing')\n",
        "axes[1].set(ylim=(0,5))\n",
        "plot_pacf(np.diff(np.diff(train[:,0])), ax=axes[1])\n",
        "plt.show()\n",
        "\n",
        "# plt.rcParams.update({'figure.figsize':(12,2), 'figure.dpi':120})\n",
        "# fig, axes = plt.subplots(1, 2, sharex=True)\n",
        "# axes[0].plot(np.diff(np.diff(train[:,1]))); axes[0].set_title('2nd Differencing')\n",
        "# axes[1].set(ylim=(0,5))\n",
        "# plot_pacf(np.diff(np.diff(train[:,1])), ax=axes[1])\n",
        "# plt.show()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TBZIHW3PHMlL",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from statsmodels.tsa.arima_model import ARIMA\n",
        "# Build Model\n",
        "model = ARIMA(train[:,0], order=(1, 2, 1)) # p d q\n",
        "# model = ARIMA(train[:,1], order=(1, 2, 1)) # p d q\n",
        "fitted = model.fit(disp=-1)  \n",
        "# Forecast\n",
        "fc, se, conf = fitted.forecast(num_test, alpha=0.05)  # 95% conf\n",
        "\n",
        "indice = date_remove_year(data.index)\n",
        "# Make as pandas series\n",
        "fc_series = pd.Series(fc, index=indice[-num_test:])\n",
        "lower_series = pd.Series(conf[:,0], index=indice[-num_test:])\n",
        "upper_series = pd.Series(conf[:,1], index=indice[-num_test:])\n",
        "train_p = pd.Series(train[:,0], index=indice[:num_train])\n",
        "test_p = pd.Series(test[:,0], index=indice[-num_test:])\n",
        "# train_p = pd.Series(train[:,1], index=indice[:num_train])\n",
        "# test_p = pd.Series(test[:,1], index=indice[-num_test:])\n",
        "\n",
        "# Plot\n",
        "fig, ax = plt.subplots(figsize=(12,5))\n",
        "# ax.figure(figsize=(6,5), dpi=100)\n",
        "ax.plot(train_p, label='training')\n",
        "ax.plot(test_p, label='test')\n",
        "ax.plot(fc_series, label='forecast')\n",
        "ax.fill_between(lower_series.index, lower_series, upper_series, color='k', alpha=.15)\n",
        "ax.set_xticklabels(indice, rotation=45, size=5)\n",
        "ax.set_title(country+' ARIMA New Cases Forecast Result') # Daily or not\n",
        "# ax.set_title(country+' ARIMA Fatalities Forecast Result')\n",
        "ax.legend(loc='upper left', fontsize=8)\n",
        "ax.set_ylabel(\"Number of cases\", size=13)\n",
        "ax.set_xlabel(\"Date\", size=13)\n",
        "fig.show()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LtjQ7oYmHnzE",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from sklearn.metrics import mean_squared_log_error\n",
        "np.around(mean_squared_log_error(test[:,0],fc), decimals=10)\n",
        "# np.around(mean_squared_log_error(test[:,1],fc), decimals=10)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YtqZem4SHzBu",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print(fitted.summary())"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}